"""
#
# Flow stability for dynamic community detection https://arxiv.org/abs/2101.06131v2
#
# Copyright (C) 2021 Alexandre Bovet <alexandre.bovet@maths.ox.ac.uk>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


This script creates a dataframe with all the edges of the APS dataset
(one edge = one pair of co-authors of a paper)

and saves the result as `all_journals_disamb_edges.csv.gz`

"""





import pandas as pd
from itertools import combinations

# Location of the raw APS metadata dump.
# NOTE(review): this variable is not referenced anywhere else in the part of
# the script reviewed here; the paths used below are hard-coded.
datadir = '../paper_data/aps/aps-dataset-metadata-2018/'



# Unconditional stop: running the file from top to bottom aborts here, so
# nothing below executes unless it is run by hand (e.g. cell by cell through
# the `#%%` markers).
# NOTE(review): presumably a guard against accidentally regenerating the
# output files -- confirm before removing.
raise Exception

#%% first get doi and date for each paper

# Table of papers; its 'doi' and 'date' columns are used further down to
# attach a date to every edge. The first CSV column is read as the index.
df_doi_dates = pd.read_csv('../data/aps/df_doi_dates.csv.gz', index_col=0)


#%% 
# This is the author name disambiguation of the APS dataset,
# available in the Supplementary Material at https://doi.org/10.1126/science.aaf5239
# The file is not very well formatted and needs some work to be imported properly.
# NOTE(review): with header=None the columns are labelled 0, 1, 2, ..., whereas
# the following cells access columns named 'id', 'author' and 'doi'. The
# renaming/cleaning step is not part of this script and has to be applied to
# df_author_doi before the next cells can run.
df_author_doi = pd.read_csv('../paper_data/aps/SupplementaryData/APS_author2DOI.dat', header=None)

#%% create author dataframe

# One row per disambiguated author ID: (ID, name, number of article records).
author_rows = []
for author_id, author_records in df_author_doi.groupby('id'):

    # sanity check: a single ID must always carry one and the same name
    assert author_records['author'].nunique() == 1

    author_name = author_records['author'].iloc[0]
    n_articles = author_records['doi'].size
    author_rows.append((author_id, author_name, n_articles))

df_authors = pd.DataFrame(data=author_rows, columns=['ID','name','num_article'])

# the author table is saved next to the other processed APS files
df_authors.to_csv('../data/aps/APS_authors.csv')

#%% create edge_list

# One (n1, n2, doi) triple per unordered pair of authors of the same paper.
edge_list = []
for paper_doi, paper_records in df_author_doi.groupby('doi'):

    author_ids = paper_records['id'].tolist()

    # keeps only articles with at least two authors and max 10 authors
    # (as in Sinatra et al.)
    if not 2 <= len(author_ids) <= 10:
        continue

    for first_author, second_author in combinations(author_ids, 2):
        edge_list.append((first_author, second_author, paper_doi))

df_edges = pd.DataFrame(data=edge_list, columns=['n1','n2','doi'])
#%%
# cleanup: discard the rows whose doi field is the string ' jr'
# (NOTE(review): presumably a name suffix that ended up in the doi column
# because of the badly formatted input file -- confirm), then check that
# every remaining doi starts with the '10.1103' prefix.
is_bogus_doi = df_edges['doi'] == ' jr'
df_edges = df_edges.loc[~is_bogus_doi]
assert df_edges['doi'].str.startswith('10.1103').all()

# attach the date of each paper; merge() defaults to an inner join, so edges
# whose doi is absent from df_doi_dates are not kept
paper_dates = df_doi_dates[['doi','date']]
df_edges = df_edges.merge(paper_dates, on='doi')
df_edges['date'] = pd.to_datetime(df_edges['date'])

# chronological order
df_edges = df_edges.sort_values('date')


df_edges.to_csv('../data/aps/all_journals_disamb_edges.csv.gz')
